# Interface for database communication in R
library(DBI)
## Warning: package 'DBI' was built under R version 4.3.3
# Enables connections via ODBC drivers
library(odbc)
## Warning: package 'odbc' was built under R version 4.3.3
# Open an ODBC connection to the "Book" database on server "Ana"
# (trusted connection, port 1433).
con <- dbConnect(
  odbc(),
  Driver = "ODBC Driver 17 for SQL Server",
  Server = "Ana",
  Database = "Book",
  Trusted_Connection = "Yes",
  Port = 1433
)
# Load the books table, then disconnect. The connection is only needed for
# this read, and `finally` releases it even if the read fails, so no open
# connection is left behind for the rest of the session.
books <- tryCatch(
  dbReadTable(con, "books"),
  finally = dbDisconnect(con)
)
# 1. Preview the table: print its first five rows
head(books, n = 5)
## id book_id work_id books_count isbn isbn13
## 1 1 2767052 2792775 272 439023483 9.780438e+12
## 2 2 3 4640799 491 439554934 9.780440e+12
## 3 3 41865 3212258 226 316015849 9.780316e+12
## 4 4 2657 3275794 487 61120081 9.780061e+12
## 5 5 4671 245494 1356 743273567 9.780744e+12
## authors original_publication_year
## 1 Suzanne Collins 2008
## 2 J.K. Rowling, Mary GrandPré 1997
## 3 Stephenie Meyer 2005
## 4 Harper Lee 1960
## 5 F. Scott Fitzgerald 1925
## original_title
## 1 the hunger games
## 2 harry potter and the philosopher's stone
## 3 twilight
## 4 to kill a mockingbird
## 5 the great gatsby
## title language_code
## 1 The Hunger Games (The Hunger Games, #1) eng
## 2 Harry Potter and the Sorcerer's Stone (Harry Potter, #1) eng
## 3 Twilight (Twilight, #1) en-US
## 4 To Kill a Mockingbird eng
## 5 The Great Gatsby eng
## average_rating ratings_count work_ratings_count work_text_reviews_count
## 1 4.34 4780653 4942365 155254
## 2 4.44 4602479 4800065 75867
## 3 3.57 3866839 3916824 95009
## 4 4.25 3198671 3340896 72586
## 5 3.89 2683664 2773745 51992
## ratings_1 ratings_2 ratings_3 ratings_4 ratings_5
## 1 66715 127936 560092 1481305 2706317
## 2 75504 101676 455024 1156318 3011543
## 3 456191 436802 793319 875073 1355439
## 4 60427 117415 446835 1001952 1714267
## 5 86236 197621 606158 936012 947718
## image_url
## 1 https://images.gr-assets.com/books/1447303603m/2767052.jpg
## 2 https://images.gr-assets.com/books/1474154022m/3.jpg
## 3 https://images.gr-assets.com/books/1361039443m/41865.jpg
## 4 https://images.gr-assets.com/books/1361975680m/2657.jpg
## 5 https://images.gr-assets.com/books/1490528560m/4671.jpg
## small_image_url
## 1 https://images.gr-assets.com/books/1447303603s/2767052.jpg
## 2 https://images.gr-assets.com/books/1474154022s/3.jpg
## 3 https://images.gr-assets.com/books/1361039443s/41865.jpg
## 4 https://images.gr-assets.com/books/1361975680s/2657.jpg
## 5 https://images.gr-assets.com/books/1490528560s/4671.jpg
## category age_group
## 1 Real-World Fiction adult
## 2 Fantasy/Adventure Fiction young
## 3 Real-World Fiction young
## 4 Fantasy/Adventure Fiction adult
## 5 Real-World Fiction young
# 2. Table dimensions as c(number of rows, number of columns)
c(nrow(books), ncol(books))
## [1] 9447 24
# 3. Names of every column in the table
names(books)
## [1] "id" "book_id"
## [3] "work_id" "books_count"
## [5] "isbn" "isbn13"
## [7] "authors" "original_publication_year"
## [9] "original_title" "title"
## [11] "language_code" "average_rating"
## [13] "ratings_count" "work_ratings_count"
## [15] "work_text_reviews_count" "ratings_1"
## [17] "ratings_2" "ratings_3"
## [19] "ratings_4" "ratings_5"
## [21] "image_url" "small_image_url"
## [23] "category" "age_group"
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 4. Remove exact duplicate rows (only distinct rows are kept)
books <- distinct(books)
# Convert original_publication_year to integer
books$original_publication_year <- as.integer(books$original_publication_year)
# 5. Count NA and empty-string values per column.
# vapply() (rather than sapply()) guarantees exactly one integer count per
# column, so the result is always a named integer vector.
na_empty_counts <- vapply(
  books,
  function(col) sum(is.na(col) | col == ""),
  integer(1)
)
na_empty_df <- data.frame(
  Column = names(na_empty_counts),
  MissingOrEmpty = na_empty_counts
)
# Keep only the columns that have at least one missing or empty value
na_empty_df <- na_empty_df[na_empty_df$MissingOrEmpty > 0, ]
# Print columns with NA or empty values
print("Columns with missing or empty values:")
## [1] "Columns with missing or empty values:"
print(na_empty_df)
## Column MissingOrEmpty
## isbn isbn 560
## isbn13 isbn13 458
## original_publication_year original_publication_year 6
## language_code language_code 1006
# 5.1. Flag every row that repeats an earlier row in full
is_repeat <- duplicated(books)
duplicate_rows <- books[is_repeat, ]
# 5.2. Report how many fully duplicated rows the table contains
num_duplicates <- sum(is_repeat)
cat("Number of duplicate rows in the 'books' table:", num_duplicates, "\n")
## Number of duplicate rows in the 'books' table: 0
# 6. Derive 'work_rating': ratings count plus text-review count of each work
books$work_rating <- with(books, work_ratings_count + work_text_reviews_count)
# Show the two source columns next to the derived one (first six rows)
preview_cols <- c("work_ratings_count", "work_text_reviews_count", "work_rating")
head(books[, preview_cols])
## work_ratings_count work_text_reviews_count work_rating
## 1 4942365 155254 5097619
## 2 4800065 75867 4875932
## 3 3916824 95009 4011833
## 4 3340896 72586 3413482
## 5 2773745 51992 2825737
## 6 2478609 140739 2619348
# 7. Drop six columns: isbn, isbn13, image_url, small_image_url,
# language_code and original_publication_year
books <- books %>%
  select(-c(isbn, isbn13, image_url, small_image_url, language_code,
            original_publication_year))
# Inspect the structure of the cleaned table
str(books)
## 'data.frame': 9434 obs. of 19 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ book_id : int 2767052 3 41865 2657 4671 11870085 5907 5107 960 1885 ...
## $ work_id : int 2792775 4640799 3212258 3275794 245494 16827462 1540236 3036731 3338963 3060926 ...
## $ books_count : int 272 491 226 487 1356 226 969 360 311 3455 ...
## $ authors : chr "Suzanne Collins" "J.K. Rowling, Mary GrandPré" "Stephenie Meyer" "Harper Lee" ...
## $ original_title : chr "the hunger games" "harry potter and the philosopher's stone" "twilight" "to kill a mockingbird" ...
## $ title : chr "The Hunger Games (The Hunger Games, #1)" "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)" "Twilight (Twilight, #1)" "To Kill a Mockingbird" ...
## $ average_rating : num 4.34 4.44 3.57 4.25 3.89 ...
## $ ratings_count : int 4780653 4602479 3866839 3198671 2683664 2346404 2071616 2044241 2001311 2035490 ...
## $ work_ratings_count : int 4942365 4800065 3916824 3340896 2773745 2478609 2196809 2120637 2078754 2191465 ...
## $ work_text_reviews_count: int 155254 75867 95009 72586 51992 140739 37653 44920 25112 49152 ...
## $ ratings_1 : int 66715 75504 456191 60427 86236 47994 46023 109383 77841 54700 ...
## $ ratings_2 : int 127936 101676 436802 117415 197621 92723 76784 185520 145740 86485 ...
## $ ratings_3 : int 560092 455024 793319 446835 606158 327550 288649 455042 458429 284852 ...
## $ ratings_4 : int 1481305 1156318 875073 1001952 936012 698471 665635 661516 716569 609755 ...
## $ ratings_5 : int 2706317 3011543 1355439 1714267 947718 1311871 1119718 709176 680175 1155673 ...
## $ category : chr "Real-World Fiction" "Fantasy/Adventure Fiction" "Real-World Fiction" "Fantasy/Adventure Fiction" ...
## $ age_group : chr "adult" "young" "young" "adult" ...
## $ work_rating : int 5097619 4875932 4011833 3413482 2825737 2619348 2234462 2165557 2103866 2240617 ...
library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
# Stack the five per-star count columns into one row per (title, star level)
ratings_long <- books %>%
  select(original_title, num_range("ratings_", 1:5)) %>%
  pivot_longer(
    cols = -original_title,
    names_to = "rating_level",
    names_prefix = "ratings_", # keep only the star number: "ratings_3" -> "3"
    values_to = "count"
  )
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.3.3
# Jittered scatter of the rating counts at each star level; y-axis breaks are
# placed every 500,000 ratings up to the largest observed count.
y_breaks <- seq(0, max(ratings_long$count), by = 500000)
ggplot(ratings_long, aes(x = rating_level, y = count)) +
  geom_jitter(width = 0.2, alpha = 0.3, color = "#8e44ad") +
  scale_y_continuous(breaks = y_breaks, labels = comma) +
  ggtitle("Scatter Plot of Book Ratings (1–5 Stars)") +
  xlab("Rating Level") +
  ylab("Number of Ratings") +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
The graph above shows that the rating activity has been predominantly positive.
library(ggplot2)
# Bar chart: number of titles in each book category
ggplot(books, aes(x = category, fill = category)) +
  geom_bar() +
  ggtitle("Distribution of Real-World Fiction vs Fantasy Fiction") +
  xlab("Book Category") +
  ylab("Number of Titles") +
  theme_minimal() +
  theme(legend.position = "none") # hide the fill legend
Here we see that Real-World Fiction contains more book titles than Fantasy/Adventure Fiction.
# Cross-tabulate titles by age group (rows) and category (columns)
table(books[["age_group"]], books[["category"]])
##
## Fantasy/Adventure Fiction Real-World Fiction
## adult 735 4881
## young 1721 2097
Plots 4 and 5 illustrate the same idea; only the graphical representation changes.
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Cross-tabulate age group against category, then flatten the table into a
# data frame with one row per (age_group, category) cell
cross_tab <- table(books$age_group, books$category)
df_tab <- setNames(
  as.data.frame(cross_tab),
  c("age_group", "category", "count")
)
# Heatmap: darker tiles hold more titles; each tile is labelled with its count
ggplot(df_tab, aes(x = category, y = age_group, fill = count)) +
  geom_tile() +
  geom_text(aes(label = count), color = "white") +
  scale_fill_gradient(low = "#add8e6", high = "#003366") +
  ggtitle(" Age Group vs Book Category Heatmap") +
  xlab("Category") +
  ylab("Age Group") +
  theme_minimal()
Antithesis –– A possible defense of the antithesis is the fact that the ‘young’ category also shows the same tendency, with a very small gap between the number of titles read in each genre. This can also be attributed to the fact that there are more distinct book titles available in the Real-World Fiction category, which implies that the same logic could apply to the adult category as well — and thus, the difference may not necessarily reflect true preference.
Argument for Thesis –– In this heatmap, we see that the majority of Real-World Fiction titles have been read by the ‘adult’ age group, which implies that this age group prefers this genre more. Furthermore, there is a noticeable gap between the number of adults reading Real-World Fiction and those reading the other genre, clearly surpassing the latter.
5. Mosaic Plot: Age Group vs Fiction Category
# install.packages("ggmosaic")
library(ggmosaic)
## Warning: package 'ggmosaic' was built under R version 4.3.3
# Mosaic plot of book category within each age group
ggplot(data = books) +
  geom_mosaic(aes(x = product(age_group), fill = category), na.rm = TRUE) +
  scale_fill_manual(values = c("#f7c6c7", "#f49ac2")) +
  labs(
    x = "Age Group",
    y = "Proportion",
    fill = "Category",
    title = "Mosaic Plot: Age Group vs Fiction Category"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
## Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `unite()` instead.
## ℹ The deprecated feature was likely used in the ggmosaic package.
## Please report the issue at <https://github.com/haleyjeppson/ggmosaic>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Antithesis –– A possible defense of the antithesis is the fact that the ‘young’ category also shows the same tendency, with a very small gap between the number of titles read in each genre. This can also be attributed to the fact that there are more distinct book titles available in the Real-World Fiction category, which implies that the same logic could apply to the adult category as well — and thus, the difference may not necessarily reflect true preference.
Counterargument –– It’s true that the number of titles in the Real-World Fiction category is higher, but we can clearly see that the gap between the two genres in the adult category is significantly larger.
# Antithesis: distribution of average ratings per age group, split by category
ggplot(books, aes(x = age_group, y = average_rating, fill = category)) +
  geom_boxplot() +
  ggtitle("Average Rating by Age Group and Category") +
  xlab("Age Group") +
  ylab("Average Rating") +
  theme_minimal()
Antithesis argument — Here we observe that the average ratings are higher for Fantasy/Adventure Fiction. This supports the idea that genre preference is not entirely dependent on age, but also on the personal phase the reader is going through, regardless of age. For example, an adult reader going through a psychologically difficult period may feel the need to read something lighter and easier to digest, which is why they may choose the Fantasy genre.
Counterargument — The difference between the averages is very small, almost negligible. The results are also influenced by the presence of outliers — a few extremely low ratings are enough to bring down the overall average, even if the majority of ratings are high. Additionally, the library contains more books from the Real-World Fiction genre, meaning we have a larger sample size for this category compared to Fantasy. Moreover, the idea of preference is often tied to adults’ general perception of the genre, not necessarily to every book they read from that genre being rated highly by them.
Adults who read Real-World Fiction
# Number of titles in the adult age group and the Real-World Fiction category
with(books, sum(age_group == "adult" & category == "Real-World Fiction"))
## [1] 4881
Adults who read Fantasy/Adventure Fiction
# Number of titles in the adult age group and the Fantasy/Adventure category
with(books, sum(age_group == "adult" & category == "Fantasy/Adventure Fiction"))
## [1] 735
Comparison Plot
library(ggplot2)
# Done previously for thesis
# Step 1: Count the titles in each age-group/category combination
adult_real <- with(
  books, sum(age_group == "adult" & category == "Real-World Fiction")
)
adult_fantasy <- with(
  books, sum(age_group == "adult" & category == "Fantasy/Adventure Fiction")
)
young_real <- with(
  books, sum(age_group == "young" & category == "Real-World Fiction")
)
young_fantasy <- with(
  books, sum(age_group == "young" & category == "Fantasy/Adventure Fiction")
)
# Step 2: One row per combination, listed in the same order as the counts
compare_df <- data.frame(
  age_group = c("Adult", "Adult", "Young", "Young"),
  category = c("Real-World Fiction", "Fantasy/Adventure Fiction",
               "Real-World Fiction", "Fantasy/Adventure Fiction"),
  count = c(adult_real, adult_fantasy, young_real, young_fantasy)
)
# Step 3: Grouped bar chart (pink palette) with each count printed above its bar
ggplot(compare_df, aes(x = category, y = count, fill = age_group)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  geom_text(
    aes(label = count),
    position = position_dodge(width = 0.7),
    vjust = -0.5,
    size = 4
  ) +
  scale_fill_manual(values = c("Adult" = "#f7c6c7", "Young" = "#f49ac2")) +
  labs(
    title = " Reader Preferences by Age Group and Category",
    x = "Book Category",
    y = "Number of Books",
    fill = "Age Group"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Adults give high ratings to Fantasy Fiction
# Mean average_rating of adult Fantasy/Adventure Fiction titles (NAs ignored)
with(
  books,
  mean(
    average_rating[age_group == "adult" & category == "Fantasy/Adventure Fiction"],
    na.rm = TRUE
  )
)
## [1] 4.025306
Adults give lower ratings to Real-World Fiction
# Mean average_rating of adult Real-World Fiction titles (NAs ignored)
with(
  books,
  mean(
    average_rating[age_group == "adult" & category == "Real-World Fiction"],
    na.rm = TRUE
  )
)
## [1] 3.988982
Comparison Graph
#Thesis
library(ggplot2)
# Mean average_rating of adult titles in each of the two categories
avg_fantasy <- with(
  books,
  mean(
    average_rating[age_group == "adult" & category == "Fantasy/Adventure Fiction"],
    na.rm = TRUE
  )
)
avg_real <- with(
  books,
  mean(
    average_rating[age_group == "adult" & category == "Real-World Fiction"],
    na.rm = TRUE
  )
)
# One row per category with its mean rating
rating_df <- data.frame(
  category = c("Fantasy/Adventure Fiction", "Real-World Fiction"),
  avg_rating = c(avg_fantasy, avg_real)
)
# Bar chart in emerald tones; each bar is labelled with its mean rounded to
# two decimals, and the y axis spans 0 to 5
ggplot(rating_df, aes(x = category, y = avg_rating, fill = category)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = round(avg_rating, 2)), vjust = -0.5, size = 5) +
  scale_fill_manual(values = c("#50c878", "#2e8b57")) +
  ylim(0, 5) +
  labs(
    title = " Average Rating by Adult Readers",
    x = "Book Category",
    y = "Average Rating"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Argument—Here we see that the average rating in both cases, when rounded to the nearest whole number, is approximately 4. This suggests that the Real-World Fiction titles read also have a high rating.
Young who read Fantasy/Adventure Fiction
# Mean average_rating of young Fantasy/Adventure Fiction titles (NAs ignored)
with(
  books,
  mean(
    average_rating[age_group == "young" & category == "Fantasy/Adventure Fiction"],
    na.rm = TRUE
  )
)
## [1] 4.021464
Young who read Real-World Fiction
# Mean average_rating of young Real-World Fiction titles (NAs ignored)
with(
  books,
  mean(
    average_rating[age_group == "young" & category == "Real-World Fiction"],
    na.rm = TRUE
  )
)
## [1] 3.993071
Combined Comparison Chart (Young vs Adult)
Create the Data Frame combined_df
# Build combined_df: one row per age-group/category pair (display labels are
# capitalised), then fill in the mean average_rating of the matching books
combined_df <- data.frame(
  age_group = c("Young", "Young", "Adult", "Adult"),
  category = c("Fantasy/Adventure Fiction", "Real-World Fiction",
               "Real-World Fiction", "Fantasy/Adventure Fiction")
)
combined_df$avg_rating <- mapply(
  function(age, cat) {
    in_group <- books$age_group == age & books$category == cat
    mean(books$average_rating[in_group], na.rm = TRUE)
  },
  tolower(combined_df$age_group), # the data stores "young" / "adult"
  combined_df$category,
  USE.NAMES = FALSE
)
# Create grouped bar chart
# Thesis
ggplot(combined_df, aes(x = category, y = avg_rating, fill = age_group)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  geom_text(
    aes(label = round(avg_rating, 2)),
    position = position_dodge(width = 0.7),
    vjust = -0.5,
    size = 4
  ) +
  scale_fill_manual(values = c("Young" = "#cda4de", "Adult" = "#50c878")) + # violet and emerald
  ylim(0, 5) +
  labs(
    title = "Average Rating Comparison by Age Group and Category",
    x = "Book Category",
    y = "Average Rating",
    fill = "Age Group"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Argument — We observe the same trend here as well. Note that the Real-World Fiction category also contains more outlier values in the ratings, which directly affects the result.
library(ggplot2)
library(dplyr)
library(scales)
#Thesis
# Step 1: Total work_rating per age group and category, plus a combined
# "age_group – category" label for the x axis
summary_df <- books %>%
  group_by(age_group, category) %>%
  summarise(
    total_work_rating = sum(work_rating, na.rm = TRUE),
    # Return an ungrouped table; this also stops summarise() from emitting
    # its "grouped output" message.
    .groups = "drop"
  ) %>%
  mutate(label = paste(age_group, "–", category)) # custom x-axis labels
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
# Step 2: Radial (polar) bar chart with one bar per age-group/category label,
# coloured by age group
ggplot(summary_df, aes(x = label, y = total_work_rating, fill = age_group)) +
  geom_col() +
  coord_polar() +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = c("adult" = "#50c878", "young" = "#cda4de")) +
  labs(
    title = " Radial View: Work Rating by Age Group and Category",
    x = "",
    y = ""
  ) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text.x = element_text(size = 6, color = "black"),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
Argument – Here it is quite clear that the Real-World Fiction category has received the highest number of reviews and ratings, and this trend has come particularly from the ‘adult’ age group, which has contributed a greater number of ratings.
library(dplyr)
#Thesis
# Total 1-star and 5-star ratings per age group and category
rating_summary <- books %>%
  group_by(age_group, category) %>%
  summarise(
    total_rating_1 = sum(ratings_1, na.rm = TRUE),
    total_rating_5 = sum(ratings_5, na.rm = TRUE),
    # Same grouping as the default (the result stays grouped by age_group),
    # but stated explicitly so summarise() does not emit its grouping message.
    .groups = "drop_last"
  )
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
# View summary: print the contents of rating_summary to the console
print(rating_summary)
## # A tibble: 4 × 4
## # Groups: age_group [2]
## age_group category total_rating_1 total_rating_5
## <chr> <chr> <int> <int>
## 1 adult Fantasy/Adventure Fiction 1122821 21229147
## 2 adult Real-World Fiction 6241617 107814254
## 3 young Fantasy/Adventure Fiction 2472688 53555354
## 4 young Real-World Fiction 3246421 47939506
Use tidyr::pivot_longer() to reshape the data for easy plotting.
library(dplyr)
library(tidyr)
# Total the 1-star and 5-star counts per age group and category (ungrouped
# result), naming the totals with their display labels, then stack them into
# long format: one row per (age_group, category, rating_type)
rating_summary <- books %>%
  group_by(age_group, category) %>%
  summarise(
    `1 Star` = sum(ratings_1, na.rm = TRUE),
    `5 Star` = sum(ratings_5, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(`1 Star`, `5 Star`),
    names_to = "rating_type",
    values_to = "count"
  )
Chart of 1-star and 5-star ratings for each category and each age group (thesis).
library(ggplot2)
library(dplyr)
# Fix the order of the two star levels ("1 Star" first, then "5 Star")
rating_summary <- mutate(
  rating_summary,
  rating_type = factor(rating_type, levels = c("1 Star", "5 Star"))
)
# One pie per age group (rows) and category (columns): stacked counts drawn
# in polar coordinates.
# NOTE(review): the stacked counts appear to share one angular scale across
# facets, so the panels may not each span a full circle — consider
# position = "fill" if per-panel proportions are intended; confirm on the
# rendered plot.
ggplot(rating_summary, aes(x = "", y = count, fill = rating_type)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  facet_grid(rows = vars(age_group), cols = vars(category)) +
  scale_fill_manual(values = c("1 Star" = "#f08080", "5 Star" = "#7bd389")) +
  labs(
    title = "Proportion of 1★ vs 5★ Ratings by Age Group and Category",
    fill = "Star Rating"
  ) +
  theme_void(base_size = 12) +
  theme(
    legend.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold", size = 11),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
Argument – This ultimately confirms the validity of our thesis, as we see that the Real-World Fiction genre has received the highest number of 5-star ratings. Considering that this genre also had the highest number of reviews, this solidifies the conclusion that adults prefer this genre the most—perhaps because it resonates more with their lifestyle and way of thinking. It is also noteworthy that for the Real-World Fiction category, adults tend to give either 1-star or 5-star ratings. This explains the earlier graphical representations where we observed a similar average between the two genres, as this pattern is clearly influenced by outlier values.
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive scatter plot: total work rating (x) against average rating (y),
# one marker per book. Colour encodes the age group and marker shape encodes
# the category; hovering shows the title and the plotted values.
plot_ly(
  data = books,
  x = ~work_rating,
  y = ~average_rating,
  type = "scatter",
  mode = "markers",
  color = ~age_group,
  # Explicit palette for the two age groups: without `colors`, plotly falls
  # back to an RColorBrewer palette, which warns that the minimal number of
  # levels it supports is 3.
  colors = c(adult = "#50c878", young = "#cda4de"),
  symbol = ~category,
  text = ~paste("Title:", original_title,
                "<br>Age Group:", age_group,
                "<br>Category:", category,
                "<br>Avg Rating:", round(average_rating, 2),
                "<br>Total Engagement:", work_rating),
  hoverinfo = "text", # show only the custom hover text built above
  marker = list(size = 6, opacity = 0.6)
) %>%
  layout(
    title = "Work Rating vs. Average Rating by Age Group and Category",
    xaxis = list(title = "Total Work Rating"),
    yaxis = list(title = "Average Rating", range = c(0, 5)),
    legend = list(title = list(text = "<b>Age Group & Category</b>"))
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
Colors = age group, shapes = category